library(tidyverse)
library(cluster)
library(factoextra)
library(dendextend)
library(broom)
library(animation)
customers <- read_csv("mall_customers.csv") %>%
janitor::clean_names() %>%
rename(sex = gender)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## CustomerID = col_double(),
## Gender = col_character(),
## Age = col_double(),
## `Annual Income (k$)` = col_double(),
## `Spending Score (1-100)` = col_double()
## )
summary(customers)
## customer_id sex age annual_income_k
## Min. : 1.00 Length:200 Min. :18.00 Min. : 15.00
## 1st Qu.: 50.75 Class :character 1st Qu.:28.75 1st Qu.: 41.50
## Median :100.50 Mode :character Median :36.00 Median : 61.50
## Mean :100.50 Mean :38.85 Mean : 60.56
## 3rd Qu.:150.25 3rd Qu.:49.00 3rd Qu.: 78.00
## Max. :200.00 Max. :70.00 Max. :137.00
## spending_score_1_100
## Min. : 1.00
## 1st Qu.:34.75
## Median :50.00
## Mean :50.20
## 3rd Qu.:73.00
## Max. :99.00
customers <- customers %>%
select(-customer_id)
age_stats <- customers %>%
summarise(n = n(),
mean = mean(age),
sd = sd(age))
customers %>%
ggplot() +
aes(x = age) +
geom_histogram(aes(y = ..density..), colour = "white", bins = 25) +
stat_function(
fun = dnorm,
args = list(
mean = age_stats$mean,
sd = age_stats$sd
),
colour = "red"
)
customers %>%
select(age) %>%
mutate(under_40 = age < 40) %>%
ggplot() +
aes(x = under_40) +
geom_bar()
Age is fairly normally distributed, with more customers under 40
A small increase at roughly age 65 = recently retired?
annual_inc_stats <- customers %>%
summarise(n = n(),
mean = mean(annual_income_k),
sd = sd(annual_income_k))
customers %>%
ggplot() +
aes(x = annual_income_k) +
geom_histogram(aes(y = ..density..), colour = "white", bins = 25) +
stat_function(
fun = dnorm,
args = list(
mean = annual_inc_stats$mean,
sd = annual_inc_stats$sd
),
colour = "red"
) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))
customers %>%
mutate(salary_range = case_when(
annual_income_k < 50 ~ "A_ under 50k",
annual_income_k >= 100 ~ "C_ over 100k",
T ~ "B_ 50 to 100k")) %>%
ggplot() +
aes(x = salary_range) +
geom_bar()
spending_score_1_100_stats <- customers %>%
summarise(n = n(),
mean = mean(spending_score_1_100),
sd = sd(spending_score_1_100))
customers %>%
ggplot() +
aes(x = spending_score_1_100) +
geom_histogram(aes(y = ..density..), colour = "white", bins = 25) +
stat_function(
fun = dnorm,
args = list(
mean = spending_score_1_100_stats$mean,
sd = spending_score_1_100_stats$sd
),
colour = "red"
) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))
customers %>%
ggplot() +
aes(x = sex) +
geom_bar()
customers_scaled <- customers %>%
select(annual_income_k, spending_score_1_100) %>%
mutate(annual_income_k = scale(annual_income_k),
spending_score_1_100 = scale(spending_score_1_100))
fviz_nbclust(customers_scaled, kmeans, method = "wss", nstart = 25)
fviz_nbclust(customers_scaled, kmeans, method = "silhouette", nstart = 25)
* Suggests k = 5
fviz_nbclust(customers_scaled, kmeans, method = "gap_stat", nstart = 25, k.max = 10)
This says k = 1
Hmm beginning to think the data is not suited to clustering
I’ll choose k = 3 from the elbow chart
customers_3k <- kmeans(customers_scaled, centers = 3, nstart = 25)
customers_3k
## K-means clustering with 3 clusters of sizes 38, 123, 39
##
## Cluster means:
## annual_income_k spending_score_1_100
## 1 1.0066735 -1.22246770
## 2 -0.6246222 -0.01435636
## 3 0.9891010 1.23640011
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 2 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3
## [149] 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1
## [186] 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3
##
## Within cluster sum of squares by cluster:
## [1] 20.81189 116.44835 19.65525
## (between_SS / total_SS = 60.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
## repeating this using k = 2, produced very different clusters each time
## repeating with k = 3 gives the same/similar clusters each time
customers_scaled %>%
kmeans.ani(centers = 3)
customers_with_3k_clusters <- augment(customers_3k, customers)
customers_with_3k_clusters %>%
ggplot() +
aes(x = annual_income_k, y = spending_score_1_100, shape = .cluster, colour = age) +
geom_point() +
scale_color_continuous(type = "viridis")
customers_with_3k_clusters %>%
ggplot() +
aes(x = annual_income_k, y = spending_score_1_100, shape = .cluster, colour = sex) +
geom_point()
The clusters seems to be roughly equal male/female
Cluster 1 = Higher annual income >= 75k, low spending score < 50, noone over 60
Cluster 2 = Lower annual income <= 75, range of spending scores, seems to be 3 smaller clusters. Particualrly dense cluster earning between ~ 40 to 70k, with a spending score of 40 - 60 ish. Where most of the people over 65 years old, and also a lot of young people
Cluster 3 = high earners, high spenders, all under 40
lets try k = 5
customers_5k <- kmeans(customers_scaled, centers = 5, nstart = 25)
customers_5k
## K-means clustering with 5 clusters of sizes 22, 23, 39, 35, 81
##
## Cluster means:
## annual_income_k spending_score_1_100
## 1 -1.3262173 1.12934389
## 2 -1.3042458 -1.13411939
## 3 0.9891010 1.23640011
## 4 1.0523622 -1.28122394
## 5 -0.2004097 -0.02638995
##
## Clustering vector:
## [1] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
## [38] 1 2 1 2 1 2 5 2 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
## [75] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
## [112] 5 5 5 5 5 5 5 5 5 5 5 5 3 4 3 5 3 4 3 4 3 5 3 4 3 4 3 4 3 4 3 5 3 4 3 4 3
## [149] 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4
## [186] 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3
##
## Within cluster sum of squares by cluster:
## [1] 5.217630 7.577407 19.655252 18.304646 14.485632
## (between_SS / total_SS = 83.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# gets same/similar clusters every time
customers_scaled %>%
kmeans.ani(centers = 5)
customers_with_5k_clusters <- augment(customers_5k, customers)
customers_with_5k_clusters %>%
ggplot() +
aes(x = annual_income_k, y = spending_score_1_100, shape = .cluster, colour = age) +
geom_point() +
scale_color_continuous(type = "viridis")
customers_with_5k_clusters %>%
ggplot() +
aes(x = annual_income_k, y = spending_score_1_100, shape = .cluster, colour = sex) +
geom_point()
Looks like 5 pretty distinct clusters.
Cluster 1 = low earner low spenders
Cluster 2 = High earner low spender
Cluster 3 = High earner high spender
Cluster 4 = 50/50
Cluster 5 = Low earner High spender
I’d say the data seems well suited to clustering where k = 5.